#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-InitialExperiments/SFARI_genes/R_Markdowns/')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; library(ggalluvial)
library(knitr)
# Look up display colours in the fixed 10-colour SFARI palette.
#
# r: index vector (anything accepted by `[`) selecting positions in the palette.
#    Elsewhere in this file positions 1-6 are used for SFARI scores 1-6 and
#    position 7 for genes that are not included.
# Returns the selected colours as a character vector. The value is returned
# visibly, so calling the function at the console prints the colours (ending the
# body with an assignment would return them invisibly).
SFARI_colour_hue = function(r) {
sfari_palette = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')
sfari_palette[r]
}
Load old and new SFARI scores
# Old SFARI Genes scoring: when a gene symbol appears on several rows only its first row is kept
SFARI_genes_old = read_csv('./../../../Models/FirstPUModel/Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv') %>%
distinct(`gene-symbol`, .keep_all = TRUE)

# New SFARI Genes scoring
SFARI_genes_new = read_csv('./../Data/SFARI_genes_01-03-2020.csv')

# One row per gene symbol present in either release, holding the old and the new
# score / syndromic tag side by side (NA on the side where the gene has no match)
SFARI_genes = full_join(
SFARI_genes_old %>%
dplyr::select(`gene-symbol`, gene_score_old = `gene-score`, syndromic_old = syndromic),
SFARI_genes_new %>%
dplyr::select(`gene-symbol`, gene_score_new = `gene-score`, syndromic_new = syndromic),
by = 'gene-symbol')
Syndromic
The syndromic category includes mutations that are associated with a substantial degree of increased risk and consistently linked to additional characteristics not required for an ASD diagnosis. If there is independent evidence implicating a gene in idiopathic ASD, it will be listed as “#S” (e.g., 2S, 3S, etc.). If there is no such independent evidence, the gene will be listed simply as “S.”
We considered a rigorous statistical comparison between cases and controls, yielding genome-wide statistical significance, with independent replication, to be the strongest possible evidence for a gene. These criteria were relaxed slightly for category 2.
Category 1 (High Confidence)
1.1: Genes in this category require evidence of recurrent and convincing mutations accompanied by a rigorous statistical comparison with the mutation frequency in controls, confirmed via independent replication. Full sequencing of a comparable number of cases and controls is also required. “Convincing” is defined as “likely to be functional,” or showing perfect segregation of mutations and phenotype in a large pedigree
1.2: Results from association studies must reach genome-wide significance, uniquely implicating a single gene, and be independently replicated, or reach genome-wide significance via meta-analysis of all current association studies. For genome-wide significant variants in an intergenic region, a nearby flanking gene would be included if it’s the only gene in strong linkage disequilibrium (LD) with the intergenic variant, or if the variant is also associated with altered expression of a particular flanking gene (or another line of strong evidence implicating this gene).
Category 2 (Strong Candidate)
2.1: These genes are rare mutations that are recurrent and convincing, accompanied by a rigorous statistical comparison with the mutation frequency in controls. Independent replication is not required. Full sequencing of a comparable number of cases and controls is required. “Convincing” is defined as “likely to be functional,” or showing perfect segregation of mutations and phenotype in a large pedigree. Rare de novo variants, likely to be disruptive, in three or more unrelated cases would qualify here
2.2: Results from association studies must reach genome-wide significance, uniquely implicating a single gene, but with no independent replication. For genome-wide significant variants in an intergenic region, a nearby flanking gene would be included if it’s the only gene in strong LD with the intergenic variant, or if the variant is also associated with altered expression of a particular flanking gene (or another line of strong evidence implicating this gene)
2.3: Genes in this category require a consistently replicated association of the same allele, falling short of genome wide significance, and must be accompanied by evidence that the risk variant has a relevant functional effect in humans. “Consistently replicated” means replication of the same variant in each follow-up association study that is appropriately powered and involves a population of the same ancestry, or it has been found inconsistent but overall significant by meta-analysis. In this regard, “gene-based” tests of association are acceptable as evidence of an association if performed in the original study, but would not be considered as evidence simply to “rescue” a subsequent study that finds a variant association that is different from the associated variant in the original study
The literature is replete with relatively small studies of candidate genes, using either common or rare variant approaches, which do not reach the criteria set out for categories 1 and 2. Genes that had two such lines of supporting evidence were placed in category 3, and those with one line of evidence were placed in category 4. Some additional lines of “accessory evidence” (indicated as “acc” in the score cards) could also boost a gene from category 4 to 3.
Category 3 (Suggestive Evidence)
3.1: Genes that meet at least two of criteria 4.1, 4.2, and 4.3, or meet one of those criteria and at least one line of category 4 accessory evidence are included here
3.2: These genes meet category 2.3 criteria, but without accompanying functional evidence
3.3: Rare de novo variants, likely to be disruptive, in two or more unrelated cases are placed in this category
Category 4 (Minimal Evidence)
4.1: Any gene in an ASD-associated multi-genic CNV (including syndromic) for which there is no other independent evidence is included here
4.2: Genes proximal to genome-wide significant intergenic variants that don’t meet category 1 or 2 criteria are placed here
4.3: This category contains any significant, convincing, but unreplicated association study data, along with any instances of multiple but inconsistent reports of association that are not overall significant by meta-analysis
4.4: Genes with a series of two or more putative mutations identified (e.g., non-synonymous substitutions, single-gene deletion, duplication, disruption by translocation) for which there is not rigorous statistical comparison with controls are included here
4.5: This category contains single rare de novo variants, likely to be disruptive
The presence of certain accessory evidence may raise a gene from category 4 to category 3. Such evidence includes:
Altered expression or function in ASD cases vs. controls in any tissue, whether as a function of genotype or not
Strong evidence for involvement in a related phenotype or disorder, specifically including ADHD, ASD-related endophenotypes (e.g., language impairment), bipolar disorder, epilepsy, intellectual disability, schizophrenia
Genes shown to be associated with ASD risk via genetic epistasis with another gene
The list of genes in SFARI Gene is inclusive, and as such there are genes that have been implicated solely by evidence in model organisms or other evidence of a marginal nature. These genes were placed in category 5, as they have not yet been rigorously tested in a human cohort. Category 6 is for those genes that have been tested in a human cohort, but the weight of the evidence argues against a role in autism.
Category 5 (Hypothesized)
5.1: This category includes genes for which the only evidence comes from studies of model organisms, without statistical or genetic support in human studies
5.2: Genes in a region of linkage with no unique evidence for that gene versus others nearby are included here
5.3: Genes in category 5.3 are shown only to functionally interact with category 1-3 ASD candidate genes
5.4: Genes with a single rare variant observed in a single ASD case/family are placed here
Category 6 (Not Supported)
Syndromic
The syndromic category includes mutations that are associated with a substantial degree of increased risk and consistently linked to additional characteristics not required for an ASD diagnosis. If there is independent evidence implicating a gene in idiopathic ASD, it will be listed as “#S” (e.g., 2S, 3S). If there is no such independent evidence, the gene will be listed simply as “S”.
Category 1 (High Confidence)
Genes in this category are all found on the SPARK gene list. Each of these genes has been clearly implicated in ASD—typically by the presence of at least three de novo likely-gene-disrupting mutations being reported in the literature—and such mutations identified in the sequencing of the SPARK cohort are typically returned to the participants. Some of these genes meet the most rigorous threshold of genome-wide significance; all at least meet a threshold false discovery rate of < 0.1
Category 2 (Strong Candidate)
Genes with two reported de novo likely-gene-disrupting mutations
A gene uniquely implicated by a genome-wide association study, either reaching genome-wide significance or, if not, consistently replicated and accompanied by evidence that the risk variant has a functional effect
Category 3 (Suggestive Evidence)
Genes with a single reported de novo likely-gene-disrupting mutation
Evidence from a significant but unreplicated association study, or a series of rare inherited mutations for which there is not a rigorous statistical comparison with controls
Announcement
In order to simplify the gene scoring system we have decided to reduce the number of categories from seven to four. The ‘S’ (syndromic) category will remain, to include genes associated with medical syndromes that are frequently accompanied by autism diagnoses. There will then be three categories for genes associated with ‘idiopathic’ autism
The new category 1 (high confidence) genes will be taken from the SPARK gene list, which are genes for which there is sufficient evidence in the literature that they are deemed ‘returnable’ results to families in a clinical setting. In practice this means three de novo likely gene-disrupting mutations in individuals with an ASD diagnosis. All of these are in either category 1 or 2 under the previous scoring system
The new category 2 (strong candidate) genes will be drawn from the previous category 3
The new category 3 (suggestive evidence) genes will be drawn from the previous category 4
The previous categories 5 and 6 will be eliminated
My Notes
The mapping between scores is not as clean as it is said to be in the announcement (1st alluvial diagram). Perhaps there was an update in the scores after the version I have and that’s where the noise is coming from? It would be interesting to see how much the scores change from one update to another, but I cannot find the different versions of the scores on the website
The description for the Syndromic tag didn’t change at all, but the genes with the tag did change a lot (2nd alluvial diagram)
New score descriptions are based only on mutations, they don’t mention gene expression experiments any more
The role of gene expression experiments in the original scoring system was only as “accessory evidence” that would allow a gene with a score of 4 to be “upgraded” to score 3, and since they mapped previous score 4 to new score 3 and previous score 3 to new score 2, technically, this is still part of the scoring criteria, but the fact that it is no longer mentioned anywhere is a bit discouraging
Why is the role of gene expression experiments so small? Why is this the first time I’m questioning this?!
The new score 1 seems to come from a mix of scores 1, 2, 3 and some new
Score 2 consists of genes with previous scores 2 and 3
Score 3 is almost completely previous score 4
Scores 5 and 6 have disappeared
# Create connection data frame with a list of flows and their intensity:
# one row per (old score, new score) pair, n = number of genes with that pair
summary_info = SFARI_genes %>% count(gene_score_old, gene_score_new) %>%
# A missing score (e.g. the gene has no match on that side of the full join) is
# shown as 'Not included'. The columns are cast to character first because recent
# tidyr releases refuse a character replacement for a non-character column.
# NOTE(review): a gene listed in a release without a score would also end up here — confirm
mutate(gene_score_old = replace_na(as.character(gene_score_old), 'Not included'),
gene_score_new = replace_na(as.character(gene_score_new), 'Not included'))

# Alluvial diagram: left axis = old scores, right axis = new scores, band width = number of genes
summary_info %>% ggplot(aes(y=n, axis1=gene_score_old, axis2=gene_score_new)) +
geom_alluvium(discern = TRUE) +
# One fill colour per stratum, 7 for the old axis followed by 4 for the new axis:
# grey for 'Not included', then the SFARI_colour_hue colours in descending score order.
# NOTE(review): this hard-coded order presumably mirrors the order in which
# ggalluvial lays out the strata, and breaks if a score level is missing — confirm
geom_stratum(discern = TRUE, color = 'white', width = 0.2,
fill=c('#999999','#59B9C9','#62CCA6','#8CC83F','#E8E328','#FFB100','#FF7631',
'#999999','#E8E328','#FFB100','#FF7631')) +
# NOTE(review): newer ggalluvial versions appear to deprecate infer.label in favour
# of aes(label = after_stat(stratum)) — check against the installed version
geom_text(stat = 'stratum', discern = FALSE, infer.label = TRUE, color='white') +
scale_x_discrete(limits=c('Old','New'), expand=c(0, 0)) + ylab('Number of Genes') +
ggtitle('Changes in SFARI scores by gene') + theme_minimal()
Around half of the genes that were previously labeled as syndromic are no longer labelled
Around half of the genes now labeled as syndromic weren’t in the dataset before
Very big differences for having no change in the tag’s definition …
# Flows between the old and the new syndromic tag:
# one row per (old tag, new tag) pair, n = number of genes with that pair
summary_info = SFARI_genes %>% count(syndromic_old, syndromic_new) %>%
# Missing tags become 'None'. The columns are cast to character first because
# recent tidyr releases refuse a character replacement for a non-character column.
mutate(syndromic_old = replace_na(as.character(syndromic_old), 'None'),
syndromic_new = replace_na(as.character(syndromic_new), 'None'),
# Sankey node ids are 0-based positions in the node list below:
# factor codes 1..3 become 0..2 for the old tags and 3..5 for the new tags.
# NOTE(review): assumes both columns contain all three levels and that they sort
# as not syndromic < syndromic < 'None', matching the label order — confirm
syndromic_old_ID = as.numeric(as.factor(syndromic_old))-1,
syndromic_new_ID = as.numeric(as.factor(syndromic_new))+2)

# Sankey diagram: nodes 0-2 are the old tags, nodes 3-5 the new tags
plot_ly(type = 'sankey',
node = list(label = rep(c('Not Syndromic', 'Syndromic' ,'Not included'),2),
color = rep(c('#00b3b3','#dc2641','#999999'),2),
pad = 5,
thickness = 40,
line = list(width = 0)),
link = list(source = summary_info$syndromic_old_ID,
target = summary_info$syndromic_new_ID,
value = summary_info$n)
) %>%
layout(title = 'Changes in SFARI syndromic tag by gene', font = list(size = 10))
library(plotly)
# Flows between old and new SFARI scores:
# one row per (old score, new score) pair, n = number of genes with that pair
summary_info = SFARI_genes %>% count(gene_score_old, gene_score_new) %>%
# Missing scores become 'None'. The columns are cast to character first because
# recent tidyr releases refuse a character replacement for a non-character column.
mutate(gene_score_old = replace_na(as.character(gene_score_old), 'None'),
gene_score_new = replace_na(as.character(gene_score_new), 'None'),
# Sankey node ids are 0-based positions in the node list below:
# old factor codes 1..7 become 0..6, new factor codes 1..4 become 7..10.
# NOTE(review): assumes the old column holds exactly the levels 1-6 plus 'None'
# and the new column 1-3 plus 'None'; a missing level shifts the ids — confirm
gene_score_old_ID = as.numeric(as.factor(gene_score_old))-1,
gene_score_new_ID = as.numeric(as.factor(gene_score_new))+6)
### Doesn't work!!!
#pos_x = c(rep(0,7), rep(1,4))
#pos_y_old = cumsum(as.vector(table(SFARI_genes$gene_score_old, useNA='ifany')))/nrow(SFARI_genes)
#pos_y_old = pos_y_old - min(pos_y_old)
#pos_y_new = cumsum(as.vector(table(SFARI_genes$gene_score_new, useNA='ifany')))/nrow(SFARI_genes)
#pos_y_new = pos_y_new - min(pos_y_new)
# Sankey diagram: nodes 0-6 are the old scores, nodes 7-10 the new scores,
# each coloured with the SFARI palette (position 7 = 'Not included')
plot_ly(type = 'sankey', arrangement = 'snap',
node = list(label = c('1','2','3','4','5','6','Not included','1','2','3','Not included'),
color = SFARI_colour_hue(c(1:7,1:3,7)),
#x = pos_x,
#y = c(pos_y_old, pos_y_new),
pad = 5,
thickness = 40,
line = list(width = 0)),
link = list(source = summary_info$gene_score_old_ID,
target = summary_info$gene_score_new_ID,
value = summary_info$n)
) %>%
layout(title = 'Changes in SFARI scores by gene', font = list(size = 10))